In this tutorial we'll demonstrate Coach's hierarchical RL support by building a new agent that implements the Hierarchical Actor Critic (HAC) algorithm (https://arxiv.org/pdf/1712.00948.pdf), and a preset that runs the agent on a goal-based version of Mujoco's pendulum environment.

The Agent

First, some imports. Note that HAC is based on DDPG, hence we will be importing the relevant classes.


In [ ]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    sys.path.append(module_path + '/rl_coach')
    
from typing import Union
import numpy as np
from rl_coach.agents.ddpg_agent import DDPGAgent, DDPGAgentParameters, DDPGAlgorithmParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.core_types import RunPhase

Now let's define the HAC algorithm and agent parameters.

See tutorial 1 for more details on the content of each of these classes.


In [ ]:
class HACDDPGAlgorithmParameters(DDPGAlgorithmParameters):
    def __init__(self):
        super().__init__()
        self.sub_goal_testing_rate = 0.5
        self.time_limit = 40


class HACDDPGAgentParameters(DDPGAgentParameters):
    def __init__(self):
        super().__init__()
        # use the HAC-specific algorithm parameters (needed for sub_goal_testing_rate and time_limit)
        self.algorithm = HACDDPGAlgorithmParameters()
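
As a quick sanity check (illustrative only, not needed for the rest of the tutorial), the HAC-specific knobs are now exposed through the algorithm parameters:


In [ ]:
# illustrative only - print the defaults defined above
hac_params = HACDDPGAgentParameters()
print(hac_params.algorithm.sub_goal_testing_rate)  # 0.5
print(hac_params.algorithm.time_limit)             # 40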

Now we'll define the agent itself - HACDDPGAgent - which subclasses the DDPG agent class. The main difference between the DDPG agent and the HACDDPGAgent is that a higher-level agent passes a sub-goal down to the lower-level agent beneath it, hence the overrides of several DDPG agent functions.


In [ ]:
class HACDDPGAgent(DDPGAgent):
    def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
        super().__init__(agent_parameters, parent)
        self.sub_goal_testing_rate = self.ap.algorithm.sub_goal_testing_rate
        self.graph_manager = None

    def choose_action(self, curr_state):
        # the top level decides, for each of its generated sub-goals, whether this is a sub-goal testing
        # phase for all of the levels beneath it
        graph_manager = self.parent_level_manager.parent_graph_manager
        if self.ap.is_a_highest_level_agent:
            graph_manager.should_test_current_sub_goal = np.random.rand() < self.sub_goal_testing_rate

        if self.phase == RunPhase.TRAIN:
            if graph_manager.should_test_current_sub_goal:
                self.exploration_policy.change_phase(RunPhase.TEST)
            else:
                self.exploration_policy.change_phase(self.phase)

        action_info = super().choose_action(curr_state)
        return action_info

    def update_transition_before_adding_to_replay_buffer(self, transition):
        graph_manager = self.parent_level_manager.parent_graph_manager

        # deal with goals given from a higher level agent
        if not self.ap.is_a_highest_level_agent:
            transition.state['desired_goal'] = self.current_hrl_goal
            transition.next_state['desired_goal'] = self.current_hrl_goal
            self.distance_from_goal.add_sample(self.spaces.goal.distance_from_goal(
                self.current_hrl_goal, transition.next_state))
            goal_reward, sub_goal_reached = self.spaces.goal.get_reward_for_goal_and_state(
                self.current_hrl_goal, transition.next_state)
            transition.reward = goal_reward
            transition.game_over = transition.game_over or sub_goal_reached

        # each level tests its own generated sub goals
        if not self.ap.is_a_lowest_level_agent and graph_manager.should_test_current_sub_goal:
            _, sub_goal_reached = self.spaces.goal.get_reward_for_goal_and_state(
                transition.action, transition.next_state)

            sub_goal_is_missed = not sub_goal_reached

            if sub_goal_is_missed:
                transition.reward = -self.ap.algorithm.time_limit
        return transition

    def set_environment_parameters(self, spaces: SpacesDefinition):
        super().set_environment_parameters(spaces)

        if self.ap.is_a_highest_level_agent:
            # the rest of the levels already have an in_action_space set to be of type GoalsSpace, thus they will have
            # their GoalsSpace set to the in_action_space in agent.set_environment_parameters()
            self.spaces.goal = self.spaces.action
            self.spaces.goal.set_target_space(self.spaces.state[self.spaces.goal.goal_name])

        if not self.ap.is_a_highest_level_agent:
            self.spaces.reward.reward_success_threshold = self.spaces.goal.reward_type.goal_reaching_reward
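
Before moving on, here is a small, purely illustrative numpy sketch of the sub-goal check that update_transition_before_adding_to_replay_buffer relies on. It mirrors the ReachingGoal reward defined in the preset below (reward 0 when every goal dimension is within its threshold, -1 otherwise); the vectors here are made up for the example, and in the agent the actual check is performed by self.spaces.goal.get_reward_for_goal_and_state.


In [ ]:
# illustrative only - not part of the agent
goal = np.array([0.0, 1.0, 0.0])             # hypothetical sub-goal
achieved = np.array([0.05, 0.98, 0.2])       # hypothetical achieved goal from the next state
threshold = np.array([0.075, 0.075, 0.75])   # same thresholds used in the preset below
sub_goal_reached = bool(np.all(np.abs(goal - achieved) < threshold))
goal_reward = 0 if sub_goal_reached else -1  # ReachingGoal(default_reward=-1, goal_reaching_reward=0)
print(sub_goal_reached, goal_reward)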

The Preset

We start by defining the top agent in the hierarchy. Note that the agent's base parameters are the same as the DDPG agent's parameters. We also define the memory, the exploration policy and the network topology here.


In [ ]:
from rl_coach.architectures.tensorflow_components.layers import Dense
from rl_coach.base_parameters import VisualizationParameters, EmbeddingMergerType, EmbedderScheme
from rl_coach.architectures.embedder_parameters import InputEmbedderParameters
from rl_coach.memories.episodic.episodic_hindsight_experience_replay import HindsightGoalSelectionMethod, \
    EpisodicHindsightExperienceReplayParameters
from rl_coach.memories.episodic.episodic_hrl_hindsight_experience_replay import \
    EpisodicHRLHindsightExperienceReplayParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.spaces import GoalsSpace, ReachingGoal
from rl_coach.exploration_policies.ou_process import OUProcessParameters
from rl_coach.core_types import EnvironmentEpisodes, EnvironmentSteps, RunPhase, TrainingSteps


time_limit = 1000
polar_coordinates = False
distance_from_goal_threshold = np.array([0.075, 0.075, 0.75])
goals_space = GoalsSpace('achieved_goal',
                         ReachingGoal(default_reward=-1, goal_reaching_reward=0,
                                      distance_from_goal_threshold=distance_from_goal_threshold),
                         lambda goal, state: np.abs(goal - state))  # raw L1 distance

top_agent_params = HACDDPGAgentParameters()

# memory - Hindsight Experience Replay
top_agent_params.memory = EpisodicHRLHindsightExperienceReplayParameters()
top_agent_params.memory.max_size = (MemoryGranularity.Transitions, 10000000)
top_agent_params.memory.hindsight_transitions_per_regular_transition = 3
top_agent_params.memory.hindsight_goal_selection_method = HindsightGoalSelectionMethod.Future
top_agent_params.memory.goals_space = goals_space
top_agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(32)
top_agent_params.algorithm.num_consecutive_training_steps = 40
top_agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(40)

# exploration - OU process
top_agent_params.exploration = OUProcessParameters()
top_agent_params.exploration.theta = 0.1

# actor - note that the default middleware is overridden with 3 dense layers
top_actor = top_agent_params.network_wrappers['actor']
top_actor.input_embedders_parameters = {'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
                                        'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)}
top_actor.middleware_parameters.scheme = [Dense([64])] * 3
top_actor.learning_rate = 0.001
top_actor.batch_size = 4096

# critic - note that the default middleware is overridden with 3 dense layers
top_critic = top_agent_params.network_wrappers['critic']
top_critic.input_embedders_parameters = {'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
                                         'action': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
                                         'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)}
top_critic.embedding_merger_type = EmbeddingMergerType.Concat
top_critic.middleware_parameters.scheme = [Dense([64])] * 3
top_critic.learning_rate = 0.001
top_critic.batch_size = 4096

The Bottom Agent


In [ ]:
from rl_coach.schedules import ConstantSchedule
from rl_coach.exploration_policies.e_greedy import EGreedyParameters


bottom_agent_params = HACDDPGAgentParameters()
bottom_agent_params.algorithm.in_action_space = goals_space

bottom_agent_params.memory = EpisodicHindsightExperienceReplayParameters()
bottom_agent_params.memory.max_size = (MemoryGranularity.Transitions, 12000000)
bottom_agent_params.memory.hindsight_transitions_per_regular_transition = 4
bottom_agent_params.memory.hindsight_goal_selection_method = HindsightGoalSelectionMethod.Future
bottom_agent_params.memory.goals_space = goals_space
bottom_agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(16 * 25)  # 25 episodes is one true env episode
bottom_agent_params.algorithm.num_consecutive_training_steps = 40
bottom_agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(40)

bottom_agent_params.exploration = EGreedyParameters()
bottom_agent_params.exploration.epsilon_schedule = ConstantSchedule(0.2)
bottom_agent_params.exploration.evaluation_epsilon = 0
bottom_agent_params.exploration.continuous_exploration_policy_parameters = OUProcessParameters()
bottom_agent_params.exploration.continuous_exploration_policy_parameters.theta = 0.1

# actor
bottom_actor = bottom_agent_params.network_wrappers['actor']
bottom_actor.input_embedders_parameters = {'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
                                           'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)}
bottom_actor.middleware_parameters.scheme = [Dense([64])] * 3
bottom_actor.learning_rate = 0.001
bottom_actor.batch_size = 4096

# critic
bottom_critic = bottom_agent_params.network_wrappers['critic']
bottom_critic.input_embedders_parameters = {'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
                                            'action': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
                                            'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)}
bottom_critic.embedding_merger_type = EmbeddingMergerType.Concat
bottom_critic.middleware_parameters.scheme = [Dense([64])] * 3
bottom_critic.learning_rate = 0.001
bottom_critic.batch_size = 4096

Now we define the parameters of all the agents in the hierarchy, ordered from top to bottom.


In [ ]:
agents_params = [top_agent_params, bottom_agent_params]

Next, we define the environment, visualization and schedule parameters. Note that the schedule parameters refer to the top-level agent.


In [ ]:
from rl_coach.environments.gym_environment import Mujoco
from rl_coach.environments.environment import SelectedPhaseOnlyDumpMethod
from rl_coach.graph_managers.hrl_graph_manager import HRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters


env_params = Mujoco()
env_params.level = "rl_coach.environments.mujoco.pendulum_with_goals:PendulumWithGoals"
env_params.additional_simulator_parameters = {"time_limit": time_limit,
                                              "random_goals_instead_of_standing_goal": False,
                                              "polar_coordinates": polar_coordinates,
                                              "goal_reaching_thresholds": distance_from_goal_threshold}
env_params.frame_skip = 10
env_params.custom_reward_threshold = -time_limit + 1

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST)]
vis_params.dump_mp4 = False
vis_params.native_rendering = False

schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentEpisodes(40 * 4 * 64)  # 40 epochs
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(4 * 64)  # 4 small batches of 64 episodes
schedule_params.evaluation_steps = EnvironmentEpisodes(64)
schedule_params.heatup_steps = EnvironmentSteps(0)
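
To put the schedule in perspective (illustrative arithmetic only, mirroring the comments above), this amounts to 40 epochs of 4 * 64 = 256 top-level training episodes each, with a 64-episode evaluation between epochs:


In [ ]:
# illustrative arithmetic only - mirrors the comments in the schedule above
episodes_per_epoch = 4 * 64                        # steps_between_evaluation_periods
total_training_episodes = 40 * episodes_per_epoch  # improve_steps ("40 epochs")
print(episodes_per_epoch, total_training_episodes)  # 256 10240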

Lastly, we create an HRLGraphManager that will execute the hierarchical agent we defined according to these parameters.

Note that the bottom-level agent will run 40 steps for each single step of the top-level agent.


In [ ]:
graph_manager = HRLGraphManager(agents_params=agents_params, env_params=env_params,
                                schedule_params=schedule_params, vis_params=vis_params,
                                consecutive_steps_to_run_each_level=EnvironmentSteps(40))
graph_manager.visualization_parameters.render = True

Running the Preset


In [ ]:
from rl_coach.base_parameters import TaskParameters, Frameworks

log_path = '../experiments/pendulum_hac'
if not os.path.exists(log_path):
    os.makedirs(log_path)
    
task_parameters = TaskParameters(framework_type=Frameworks.tensorflow,
                                 evaluate_only=False,
                                 experiment_path=log_path)

task_parameters.__dict__['checkpoint_save_secs'] = None
task_parameters.__dict__['verbosity'] = 'low'

graph_manager.create_graph(task_parameters)

graph_manager.improve()
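
Optionally, once training finishes, you can run a few evaluation episodes on the trained hierarchy. This is just a sketch: it assumes the graph manager's evaluate method, which accepts a steps counter, and the episode count below is an arbitrary choice.


In [ ]:
# optional sketch - assumes GraphManager.evaluate; the episode count is arbitrary
graph_manager.evaluate(EnvironmentEpisodes(5))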
